import struct
import os
import numpy as np
import pandas as pd
import random
import plotly.graph_objects as go
import math
# Absolute paths of the binary ".comparisons" files analysed by this script.
# Each one is handed to plot_comparissons_histogram(), which decodes it with
# decode_comparisons_file(). The experiment configuration is only encoded in
# the file/folder names (presumably q-value, margin and network variant --
# confirm against the training setup that produced them).
COMP_FILE = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng/sample_embeddings_001825.comparisons"
COMP_FILE_Q001 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_001825.comparisons"
COMP_FILE_Q001_04 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01_margin0.4/sample_embeddings_q0.01_margin0.4_002281.comparisons"
COMP_FILE_Q0001 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_002281.comparisons"
COMP_FILE_Q001_LSTM40_3LAYERS = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_lstm40_3layers_002281.comparisons"
COMP_FILE_Q001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_big_002281.comparisons"
COMP_FILE_Q0001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_002281.comparisons"
COMP_FILE_Q0001_BIG_LSTM40_3LAYER = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_002281.comparisons"
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.4_002281.comparisons"
# NOTE(review): this path uses the mount point "Seagate Expansion Drive"
# (no trailing "1"), unlike the other Seagate paths above -- confirm that
# this is intentional and not a typo.
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05 = "/media/eduseiti/Seagate Expansion Drive/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.5_002281.comparisons"
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048 = "/mnt/f633ac7c-3153-4566-a009-229a0ae5f8a1/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001_margin0.48/sample_embeddings_q0.001_big_lstm40_3layer_margin0.48_002281.comparisons"

# Folder holding the "pvalue" comparison files; the COMP_ALL_PVALUE_10* names
# below are bare file names that get joined to it with os.path.join().
BASE_PVALUE_FOLDER="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue"
COMP_ALL_PVALUE_10="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_002281.comparisons"
COMP_ALL_PVALUE_10_LOG_SCALING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_log_scaling_002281.comparisons"
COMP_ALL_PVALUE_10_TEST="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_test_002281.comparisons"
COMP_ALL_PVALUE_10_WINSORIZING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_winsorizing_002281.comparisons"
COMP_ALL_PVALUE_10_IDENTIFICATIONS_FIX="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_identifications_fix_002281.comparisons"
COMP_ALL_PVALUE_10_CELL_STATE="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_002281.comparisons"
COMP_ALL_PVALUE_10_CELL_STATE_NO_WINSORIZING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_no_winsorizing_002281.comparisons"
# struct format of one comparison record: unsigned char, unsigned int,
# unsigned char, unsigned int, double. No byte-order prefix is given, so the
# platform's native byte order, size and alignment (padding) apply.
STRUCT_FIELDS = "BIBId"


def decode_comparisons_file(comparisons_filename):
    """Decode a binary comparisons file into a 2-D NumPy array.

    The file is read as a sequence of fixed-size records laid out according
    to ``STRUCT_FIELDS``. Every record whose last (double) field is NaN is
    reported on stdout together with its raw bytes, and a summary line with
    the number of decoded records is printed at the end.

    Args:
        comparisons_filename: Path of the binary file to decode.

    Returns:
        A ``float64`` array of shape ``(number_of_records, number_of_fields)``.
        An empty file yields an array of shape ``(0, number_of_fields)`` so
        that callers can always treat the result as a table.

    Raises:
        OSError: If the file cannot be opened or read.
        struct.error: If the file ends with an incomplete record.
    """
    # Compile the format once instead of re-parsing it for every record.
    record_struct = struct.Struct(STRUCT_FIELDS)
    record_size = record_struct.size
    # Number of values per record, derived from the format itself.
    field_count = len(record_struct.unpack(bytes(record_size)))

    comparisons = []

    with open(comparisons_filename, "rb") as input_file:
        # read() returns b"" at end of file, which stops the iteration.
        for record in iter(lambda: input_file.read(record_size), b""):
            if len(record) != record_size:
                # Same exception type the unpacking would raise, but with a
                # message that points at the actual problem.
                raise struct.error(
                    "truncated record in {}: expected {} bytes, got {}".format(
                        comparisons_filename, record_size, len(record)))

            unpacked = record_struct.unpack(record)
            comparisons.append(unpacked)

            if math.isnan(unpacked[4]):
                print("nan: {}".format(record))

    print("Decoded {} comparisons from {}".format(len(comparisons), comparisons_filename))

    # reshape() keeps the result two-dimensional even when no record was read.
    return np.array(comparisons, dtype=np.float64).reshape(-1, field_count)
def plot_comparissons_histogram(comparisons_filename, sample_fraction=0.1, bins=1000):
    """Decode a comparisons file, print statistics and plot a histogram.

    The decoded records are loaded into a DataFrame, summary statistics of
    the ``cosine_similarity`` column are printed (with percentiles from 0%
    to 95% in 5% steps), and a histogram computed over a random subset of
    the rows is shown as a red plotly bar chart.

    Args:
        comparisons_filename: Path of the binary comparisons file.
        sample_fraction: Fraction of the rows (between 0 and 1) randomly
            drawn, without replacement, to compute the histogram.
        bins: Bin specification forwarded to ``numpy.histogram``.

    Returns:
        A tuple ``(comparisons_df, histogram_counts, bin_edges)`` where
        ``comparisons_df`` holds every decoded record and the other two
        items are the values returned by ``numpy.histogram``.

    Raises:
        ValueError: If ``sample_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= sample_fraction <= 1.0:
        raise ValueError(
            "sample_fraction must be between 0 and 1, got {}".format(sample_fraction))

    comparisons = decode_comparisons_file(comparisons_filename)

    comparisons_df = pd.DataFrame(
        comparisons,
        columns=["file_1", "scannr_1", "file_2", "scannr_2", "cosine_similarity"])

    percentiles = list(np.round(np.arange(0.0, 1.0, 0.05), 2))
    print(comparisons_df['cosine_similarity'].describe(percentiles=percentiles))

    # The histogram is computed on a random subset of the rows only.
    sample_size = int(len(comparisons) * sample_fraction)
    sampled_rows = random.sample(range(len(comparisons)), sample_size)

    cosSim_histogram, costSim_bin_edges = np.histogram(
        comparisons_df['cosine_similarity'].loc[sampled_rows], bins)

    fig = go.Figure()

    # Each bar is positioned at the right edge of its bin.
    fig.add_trace(go.Bar(y=cosSim_histogram,
                         x=costSim_bin_edges[1:],
                         marker_color='red'))

    fig.show()

    return comparisons_df, cosSim_histogram, costSim_bin_edges
# Plot the histogram of every "pvalue" comparisons file. Each iteration
# overwrites embeddings_hist, so only the latest result is kept.
for _pvalue_comparisons_name in (
        COMP_ALL_PVALUE_10_CELL_STATE_NO_WINSORIZING,
        COMP_ALL_PVALUE_10_CELL_STATE,
        COMP_ALL_PVALUE_10_IDENTIFICATIONS_FIX,
        COMP_ALL_PVALUE_10_WINSORIZING,
        COMP_ALL_PVALUE_10_TEST,
        COMP_ALL_PVALUE_10_LOG_SCALING,
        COMP_ALL_PVALUE_10):
    _, embeddings_hist, _ = plot_comparissons_histogram(
        os.path.join(BASE_PVALUE_FOLDER, _pvalue_comparisons_name))

# Plot the remaining comparison files, keeping one histogram per file.
_, embeddings_hist, _ = plot_comparissons_histogram(COMP_FILE)

_, embeddings_q001_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001)

_, embeddings_q0001_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001)

_, embeddings_q001_04_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_04)

_, embeddings_q001_lstm40_3layers_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q001_LSTM40_3LAYERS)

_, embeddings_q001_big_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_BIG)

_, embeddings_q0001_big_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG)

_, embeddings_q0001_big_lstm40_3layer_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER)

_, embeddings_q0001_big_lstm40_3layer_margin04_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04)

_, embeddings_q0001_big_lstm40_3layer_margin05_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05)

_, embeddings_q0001_big_lstm40_3layer_margin048_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048)